package com.zhen.lucene;

import java.util.Arrays;
import java.util.List;

/**
 * Minimal TF-IDF calculator over pre-tokenized documents.
 *
 * <p>A document is a list of terms and a corpus is a list of documents.
 * Term comparison is case-insensitive throughout.
 */
public class TfIdfCal {

    /**
     * Small demo: prints the term frequency, document frequency and TF-IDF
     * weight of one term against a four-document sample corpus.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> doc1 = Arrays.asList("人工","智能","成为","互联网","大会","焦点");
        List<String> doc2 = Arrays.asList("谷歌","推出","开源","人工","智能","系统","工具");
        List<String> doc3 = Arrays.asList("互联网","的","未来","在","人工","智能");
        List<String> doc4 = Arrays.asList("谷歌","开源","机器","学习","工具");

        List<List<String>> documents = Arrays.asList(doc1,doc2,doc3,doc4);

        TfIdfCal calculator = new TfIdfCal();
        System.out.println(calculator.tf(doc2,"谷歌"));
        System.out.println(calculator.df(documents,"谷歌"));
        double tfIdf = calculator.tfIdf(doc2,documents,"谷歌");
        System.out.println("TF-IDF (谷歌) = " + tfIdf);

    }

    /**
     * Computes the term frequency (tf): the number of occurrences of
     * {@code term} in {@code doc} divided by the number of terms in {@code doc}.
     *
     * @param doc  terms of the document
     * @param term term to count (compared case-insensitively)
     * @return the relative frequency in the range [0, 1]; {@code 0.0} when the
     *         document is empty or {@code term} is {@code null}
     */
    public double tf(List<String> doc,String term){
        // An empty document would otherwise evaluate 0.0 / 0 and yield NaN.
        if(term == null || doc.isEmpty()){
            return 0.0;
        }
        double termFrequency = 0;
        for(String str : doc){
            // Invoked on term (known non-null) so that null tokens are simply skipped.
            if(term.equalsIgnoreCase(str)){
                termFrequency++;
            }
        }
        return termFrequency / doc.size();
    }

    /**
     * Computes the document frequency (df): the number of documents in
     * {@code docs} that contain {@code term} at least once.
     *
     * <p>When {@code term} is {@code null} or blank a message is printed to
     * standard output and {@code 0} is returned.
     *
     * @param docs the corpus
     * @param term term to look for (compared case-insensitively)
     * @return number of documents containing the term
     */
    public int df(List<List<String>> docs,String term){
        if(term == null || "".equals(term.trim())){
            System.out.println("term不能null或者空串");
            return 0;
        }
        int n = 0;
        for(List<String> doc : docs){
            for(String str : doc){
                if(term.equalsIgnoreCase(str)){
                    n++;
                    // One hit is enough: each document is counted at most once.
                    break;
                }
            }
        }
        return n;
    }

    /**
     * Computes the inverse document frequency (idf) as
     * {@code ln(N / df + 1)}, where {@code N} is the number of documents in
     * the corpus and {@code df} is the document frequency of {@code term}.
     *
     * @param docs the corpus
     * @param term term to weigh
     * @return the idf value; {@code 0.0} when no document contains the term
     *         (including an empty corpus or a blank/{@code null} term)
     */
    public double idf(List<List<String>> docs,String term){
        int documentFrequency = df(docs,term);
        // Dividing by a zero df would produce Infinity (or NaN for an empty
        // corpus), which then poisons tf * idf; treat unseen terms as weightless.
        if(documentFrequency == 0){
            return 0.0;
        }
        return Math.log(docs.size() / (double)documentFrequency + 1);
    }

    /**
     * Computes the TF-IDF weight of {@code term} for {@code doc} within the
     * corpus {@code docs}, i.e. {@code tf(doc, term) * idf(docs, term)}.
     *
     * @param doc  terms of the document being scored
     * @param docs the corpus
     * @param term term to weigh
     * @return the TF-IDF weight; {@code 0.0} when the term does not occur in
     *         the corpus
     */
    public double tfIdf(List<String> doc,List<List<String>> docs,String term){
        return tf(doc,term) * idf(docs, term);
    }
}
